{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fall / Near Fall / ADL Classification"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys, os\n",
    "from pyspark.sql import SparkSession, types, functions\n",
    "from pyspark import SparkConf, SparkContext\n",
    "import sys\n",
    "from pyspark.ml import Pipeline\n",
    "from pyspark.ml.feature import VectorAssembler, StringIndexer, SQLTransformer\n",
    "from pyspark.ml.classification import NaiveBayes\n",
    "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n",
    "\n",
    "from sklearn.cross_validation import train_test_split\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier\n",
    "from sklearn.ensemble import VotingClassifier\n",
    "from sklearn import ensemble\n",
    "from sklearn.feature_selection import VarianceThreshold, RFE\n",
    "from sklearn.metrics import accuracy_score, confusion_matrix\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "spark = SparkSession.builder.appName('Big Data 2 Project').getOrCreate()\n",
    "sc = spark.sparkContext\n",
    "#assert sys.version_info >= (3, 4)  # make sure we have Python 3.4+\n",
    "#assert spark.version >= '2.2'  # make sure we have Spark 2.2+\n",
    "data=pd.read_csv('Class3D.csv',header='infer')\n",
    "df=data.drop('Unnamed: 0',1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(Rows, Columns) in training dataset (480, 135)\n",
      "Number of Features in training dataset:- 135\n",
      "Number of Features in testing dataset:- 135\n"
     ]
    }
   ],
   "source": [
    "train=df.iloc[0:480,:]\n",
    "test=df.iloc[480:,:]\n",
    "print(\"(Rows, Columns) in training dataset\",train.shape)\n",
    "print(\"Number of Features in training dataset:-\",train.shape[1])\n",
    "print(\"Number of Features in testing dataset:-\",test.shape[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#Separating training data Features and Predictions into different dataframes\n",
    "X_train=train.drop('class_label',axis=1)\n",
    "Y_train=train['class_label']\n",
    "\n",
    "#Separating testing data Features and Predictions into different dataframes\n",
    "X_test=test.drop('class_label',axis=1)\n",
    "Y_test=test['class_label']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Number of Features in training dataset(before feature selection by Variance Threshold):- 134\n",
      "Number of Features in training dataset(after feature selection by Variance Threshold):- 126\n"
     ]
    }
   ],
   "source": [
    "#Performing Feature Selection on Training Dataset\n",
    "print(\"Number of Features in training dataset(before feature selection by Variance Threshold):-\",X_train.shape[1])\n",
    "X_train_temp = X_train.copy(deep=True)  # Make a deep copy of the Training Data dataframe\n",
    "selector = VarianceThreshold(0.12)\n",
    "selector.fit(X_train_temp)\n",
    "X_res = X_train_temp.loc[:, selector.get_support(indices=False)]\n",
    "X_train=X_res\n",
    "print(\"Number of Features in training dataset(after feature selection by Variance Threshold):-\",X_train.shape[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Find most important features relative to target through correlation\n"
     ]
    }
   ],
   "source": [
    "traini = pd.concat([X_train, Y_train], axis=1, join='inner')\n",
    "\n",
    "# Find most important features relative to target i.e finding correlation of every individual feature i.e independent variable with dependent variable and then sorting them and using the features that have maximum correlation\n",
    "print(\"Find most important features relative to target through correlation\")\n",
    "corr = traini.corr()\n",
    "corr.sort_values([\"class_label\"], ascending = False, inplace = True)\n",
    "\n",
    "#Selecting top-20 features\n",
    "top_features=corr.class_label[1:10].to_frame()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['mean_lthighMagneticFieldY', 'mean_category', 'mean_sternumAccelerationX', 'mean_headAccelerationX', 'mean_waistMagneticFieldY', 'mean_waistAccelerationX', 'mean_sternumMagneticFieldY', 'mean_lankleMagneticFieldZ', 'mean_sternumMagneticFieldX']\n"
     ]
    }
   ],
   "source": [
    "features=[]\n",
    "columns=top_features.index\n",
    "for col in columns:\n",
    "    features.append(col)\n",
    "print(features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "9\n",
      "Index(['mean_lthighMagneticFieldY', 'mean_category',\n",
      "       'mean_sternumAccelerationX', 'mean_headAccelerationX',\n",
      "       'mean_waistMagneticFieldY', 'mean_waistAccelerationX',\n",
      "       'mean_sternumMagneticFieldY', 'mean_lankleMagneticFieldZ',\n",
      "       'mean_sternumMagneticFieldX'],\n",
      "      dtype='object')\n",
      "9\n",
      "Index(['mean_lthighMagneticFieldY', 'mean_category',\n",
      "       'mean_sternumAccelerationX', 'mean_headAccelerationX',\n",
      "       'mean_waistMagneticFieldY', 'mean_waistAccelerationX',\n",
      "       'mean_sternumMagneticFieldY', 'mean_lankleMagneticFieldZ',\n",
      "       'mean_sternumMagneticFieldX'],\n",
      "      dtype='object')\n"
     ]
    }
   ],
   "source": [
    "#Selecting Features for training dataset\n",
    "X_train=X_train[features]\n",
    "print(X_train.shape[1])\n",
    "print(X_train.columns)\n",
    "\n",
    "#Selecting Same Features for testing dataset\n",
    "X_test=X_test[X_train.columns]\n",
    "print(X_test.shape[1])\n",
    "print(X_test.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Seehra\\Anaconda3\\lib\\site-packages\\ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    }
   ],
   "source": [
    "X_train['class_label']=Y_train\n",
    "X_test['class_label']=Y_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.ml.classification import GBTClassifier,RandomForestClassifier\n",
    "df_train=spark.createDataFrame(X_train)\n",
    "df_test=spark.createDataFrame(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "prediction_Col_Name = \"prediction\"\n",
    "vecAssembler = VectorAssembler(inputCols=df_train.schema.names[:-1],\n",
    "                                       outputCol=\"features\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#nb=NaiveBayes(featuresCol=\"features\", labelCol=\"class_label\",predictionCol=prediction_Col_Name)\n",
    "gbt=GBTClassifier(featuresCol='features',labelCol='class_label',maxDepth=20)\n",
    "rf=RandomForestClassifier(featuresCol='features',labelCol='class_label')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "GB_Classifier = [vecAssembler, rf]\n",
    "\n",
    "models = [('GB Classifier', Pipeline(stages=GB_Classifier))]\n",
    "\n",
    "evaluator = MulticlassClassificationEvaluator(predictionCol=\"prediction\",labelCol=\"class_label\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "for label, pipeline in models:\n",
    "    model = pipeline.fit(df_train)\n",
    "    pred = model.transform(df_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "pred_cols=pred.select('prediction','class_label')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from pyspark.sql import types, functions\n",
    "pred_cols.createOrReplaceTempView('metric')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "tp=spark.sql('''select count(*) from metric where class_label=1 and prediction=1''').head()[0]\n",
    "fp=spark.sql('''select count(*) from metric where class_label=0 and prediction=1''').head()[0]\n",
    "tn=spark.sql('''select count(*) from metric where class_label=0 and prediction=0''').head()[0]\n",
    "fn=spark.sql('''select count(*) from metric where class_label=1 and prediction=0''').head()[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "sensitivity=tp/(tp+fn)\n",
    "specificity=tn/(tn+fp)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.9852941176470589"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sensitivity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.825"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "specificity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
